*****************************************************************
*****************************************************************
*****                                                       *****
*****        Mona Morgan-Collins (Durham University)        *****
*****       Contact: mona.morgan-collins@durham.ac.uk       *****
*****                                                       *****
*****  The Electoral Impact of Newly Enfranchised Groups:   *****
*****  The Case of Women’s Suffrage in the United States.   *****
*****                                                       *****
*****                 Journal of Politics                   *****
*****                                                       *****
*****           Building replication data sets              *****
*****                                                       *****
*****************************************************************
*****************************************************************




*Stata version 15.1
cd "./MMC_repl" /*set your working directory*/ 

*install the following user-written packages from SSC: 
ssc install coefplot
ssc install cibar
ssc install labutil                                    //provides -labmask-, called when the NAWSA data set is built
ssc install center                                     //provides -center-, called when the centered interaction variables for MMC_1920 are built

*This file builds the nine final data sets listed below, which are used for the replication analyses 
	*MMC_Illinois replicates robustness analyses with Illinois data (Figures 3, 4, A23; Tables A13, A14)
	*MMC_NAWSA replicates Figure A5
	*MMC_1920 for the main analysis that uses 1918 and 1920 election data
	*MMC_1918 for several robustness tests that use 1918 and 1916 election data
	*MMC_1922 for several robustness tests that use 1922 and 1920 election data
	*MMC_1922long replicates Figure A8
	*MMC_1920west replicates Figure A22
	*MMC_dwprog replicates Figure A4
	*MMC_shifts replicates Figures A16 & A17
	
*Note: All commands below need to be run sequentially, so that data sets created earlier are available to the commands that use them later.




*****************************************************
*****************************************************
**         CREATING ILLINOIS DATA SET              **
*****************************************************
*****************************************************

*Builds MMC_illinois.dta from the "Illinois" sheet of the raw workbook.
*All derived variables are expressed in percent (ratio multiplied by 100); gaps and changes are differences of those percentages.
import excel "./MMC_raw.xlsx", sheet("Illinois") firstrow case(lower) clear

*generating variables
generate awomen20 = (f21/(m21+f21))*100                //percent of adult women in 1920
generate wshare16 = (votw16/(votm16+votw16))*100       //women's share of overall turnout in 1916
generate turn12 = (vot12/m21)*100                      //overall (men's) turnout in 1912
generate turnw16 = (votw16/f21)*100                    //women's turnout in 1916
generate turnm16 = (votm16/m21)*100                    //men's turnout in 1916
generate turngap16 = turnw16-turnm16                   //gap between women's and men's turnout in 1916
generate dturnm1612 = turnm16-turn12                   //change in male turnout between 1916 and 1912
generate dem12 = (votd12/vot12)*100                    //overall (men's) support for Democrats in 1912
generate demm16 = (votdm16/votm16)*100                 //men's support for Democrats in 1916
generate demw16 = (votdw16/votw16)*100                 //women's support for Democrats in 1916
generate demgap16 = demw16-demm16                      //gap between women's and men's support for Democrats in 1916
generate ddemm1612 = demm16-dem12                      //change in male support for Democrats between 1916 and 1912

*-replace- lets the do-file be re-run without first deleting the output by hand
save "./MMC_illinois.dta", replace




*****************************************************
*****************************************************
**            CREATING NAWSA DATA SET              **
*****************************************************
*****************************************************

*importing & cleaning icpsr state codes
*[the workbook path previously read "._raw.xlsx", which does not match the raw file used everywhere else in this do-file]
import excel "./MMC_raw.xlsx", sheet("icpsr_state_code") firstrow case(lower) clear
save "./icpsr_state_code.dta", replace                 //-replace- keeps the do-file re-runnable

*importing & cleaning nawsa data
import excel "./MMC_raw.xlsx", sheet("nawsa") firstrow case(lower) clear
rename state state_name                                //preparing for merger: the nawsa sheet holds state names, the numeric ICPSR code comes from the merge
merge m:1 state_name using "./icpsr_state_code.dta" 
drop _merge

*identifying states in the sample (0/1 flag; observations with a missing state code get 0)
generate sample = inlist(state, 1, 2, 3, 4, 5, 6, 11, 12, 14, 21, 22, 23, 24, 25, 31, 33, 34, 35, 36, 37, 40, 41, 42, 43, 45, 47, 48, 49, 51, 52, 53, 54, 56, 66)

labmask state, values(state_name)                      //preparing data for xtline in Figure A5: state names become value labels of the numeric code
xtset state yr                                         
save "./MMC_nawsa.dta", replace




*****************************************************
*****************************************************
**             CREATING 1920 DATA SET              **
*****************************************************
*****************************************************


*****************************************************
*Creating Stata data set with election data 
*****************************************************

*importing & cleaning data on elections
import excel "./MMC_raw.xlsx", sheet("h_election") firstrow clear 

*keeping states in the sample only (observations with a missing state code are dropped as well)
keep if inlist(state, 1, 2, 3, 4, 5, 6, 11, 12, 14, 21, 22, 23, 24, 25, 31, 33, 34, 35, 36, 37, 40, 41, 42, 43, 45, 47, 48, 49, 51, 52, 53, 54, 56, 66)
label variable state  "ICPSR state code"
label variable county "ICPSR county code"
keep state state_name county county_name cd20 rep20 dem20 turn20 cd18 rep18 dem18 turn18 prog12 //keeping only relevant variables
replace cd20=1 if cd20==98 & state==66                 //spotted different coding of states with a single at-large district [preparing for merger with bills66.dta, generated below]
replace cd18=1 if cd18==98 & state==66                 //spotted different coding of states with a single at-large district [preparing for merger with bills66.dta, generated below]

*generating variables
egen cd20_u = group(state cd20)                        //creating unique district identifiers
egen cd18_u = group(state cd18)
generate redistcd2018 = (cd20 != cd18)                 //dummy for counties whose district number differs between 1920 and 1918 [same result as flagging cd20>cd18 or cd20<cd18]
generate dturn2018 = turn20-turn18                     //change in turnout between 1920 and 1918
foreach var of varlist rep18 dem18 {                   //copies of the 1918 vote shares with missing values set to 0, used only for the margin below
generate `var'_0 = `var'
replace `var'_0 = 0 if `var'_0==.
}
generate margin18 = round(abs(dem18_0-rep18_0), 0.1)   //county-level margin between Republican and Democratic candidates in 1918, rounded to 0.1
generate third18 = 100-rep18-dem18                     //support of third parties [missing whenever rep18 or dem18 is missing]
generate third20 = 100-rep20-dem20
generate prog12_01 = (prog12>0) & !missing(prog12)     //dummy for counties in districts with a progressive candidate in 1912 [0 when prog12 is missing]

*-replace- lets the do-file be re-run without first deleting the output by hand
save "./MMC_1920.dta", replace

*creating dummy variables for Rep/Dem entry/withdrawal between 1920 & 1918
preserve                                               //temporarily preserve to collapse at district level
*keeping states in the sample only
keep if inlist(state, 1, 2, 3, 4, 5, 6, 11, 12, 14, 21, 22, 23, 24, 25, 31, 33, 34, 35, 36, 37, 40, 41, 42, 43, 45, 47, 48, 49, 51, 52, 53, 54, 56, 66)
collapse (mean) rep18 dem18 rep20 dem20, by(cd20_u)

*generating dummies for candidacy: 1 if the district mean vote share is positive, 0 if it is zero or missing
foreach v in rep20 dem20 rep18 dem18 {
	generate c`v' = (`v'>0) & !missing(`v')
}
generate dcr2018 = crep20-crep18                       //generating change in candidacy: -1 = withdrawal, 0 = no change, 1 = entry
generate dcd2018 = cdem20-cdem18                           
generate wrd20 = (dcd2018==-1) | (dcr2018==-1)         //generating withdrawal dummy [Rep or Dem candidate in 1918 but not in 1920]
generate nrd20 = (dcd2018==1) | (dcr2018==1)           //generating entry dummy [Rep or Dem candidate in 1920 but not in 1918]
keep cd20_u wrd20 nrd20                                //creating data set to be merged with MMC_1920
save "./cd20.dta", replace                             //-replace- keeps the do-file re-runnable
restore                                                //restoring to pre-collapsed data set at county level

*merging with master
use "./MMC_1920.dta", clear
merge m:1 cd20_u using "./cd20.dta"
drop _merge

save "./MMC_1920.dta" , replace


*****************************************************
*Adding votes on progressive bills in 66th Congress
*****************************************************

*importing & cleaning data on bills and incumbent party
import excel "./MMC_raw.xlsx", sheet("h_bills 66th") firstrow clear  //importing raw data

*keeping states in the sample (observations with a missing state code are dropped as well)
keep if inlist(state, 1, 2, 3, 4, 5, 6, 11, 12, 14, 21, 22, 23, 24, 25, 31, 33, 34, 35, 36, 37, 40, 41, 42, 43, 45, 47, 48, 49, 51, 52, 53, 54, 56, 66)

*dropping districts where incumbent was a non-member (code 0) on at least one of the bills, i.e. excluding incumbents who did not serve full term
*[this must run before the recode below, which reuses 0 for every vote that is not a `yea' cast in person]
foreach var of varlist V2 - V321 {
drop if `var' == 0
}	 
recode V2-V321 (2 3 4 5 6 7 8 9 = 0)                         //identifying `yea' votes cast in person: code 1 is kept, codes 2-9 become 0
rename dist cd20                                             //preparing for merger: to distinguish between districts of different congresses
rename party pty66                                           //preparing for merger: to distinguish between incumbents of different congresses 
foreach var of varlist V2 - V321 {                           //preparing for merger: to distinguish between bills from different congresses
rename `var' `var'_66
}
save "./bills66.dta", replace                                //-replace- keeps the do-file re-runnable

*merging with master
*Attaches the district-level roll-call votes and incumbent party of the 66th Congress (bills66.dta) to the county-level master by state and 1920 district.
use "./MMC_1920.dta"                        
merge m:1 state cd20 using "./bills66.dta"                    //[54 districts in bills66.dta not matched [missing in h_election]; 240 counties in MMC_1920.dta not matched [these refer to counties that either do not map into district boundaries - do not have a single assigned district in h_election - OR to counties that belong to districts that were excluded above for having incumbents who did not `serve full terms'].
drop if _merge==2                                             //dropping unmatched districts [see note above]  
drop _merge

*cleaning data
label var pty66 "ICPSR party code 66th Congress"
label var V2_66   "Suffrage"                                  //labelling salient bills
label var V201_66 "Women's Bureau"
label var V43_66  "Prohibition"
label var V141_66 "Veteran Pensions"
label var V207_66 "Civil service"
label var V238_66 "WW Compensation"
label var V250_66 "Immigration"
label var V104_66 "Smith-Fess"
label define yn 0 "No" 1 "Yes"
label values V2_66-V321_66 yn 

*generating variables
*Each progressive score is the percentage of the listed bills on which the incumbent's vote is coded 1; it is missing if any listed bill is missing.
*progsc66 uses 22 bills; progsc66I omits V103 and V250 (20 bills); progsc66P omits V41, V43 and V112 (19 bills).
gen progsc66 = ((V2_66+V17_66+V33_66+V35_66+V41_66+V43_66+V96_66+V103_66+V104_66+V112_66+V141_66+V153_66+V170_66+V201_66+V207_66+V211_66+V215_66+V228_66+V238_66+V250_66+V310_66+V321_66)/22)*100 //progressive score for 66th congress; list of bills in Appendix Table 3 and `progressive' sheet in MMC_raw.xls
gen progsc66I = ((V2_66+V17_66+V33_66+V35_66+V41_66+V43_66+V96_66+V104_66+V112_66+V141_66+V153_66+V170_66+V201_66+V207_66+V211_66+V215_66+V228_66+V238_66+V310_66+V321_66)/20)*100                //alternative progressive score in 66th congress [for robustness]
gen progsc66P = ((V2_66+V17_66+V33_66+V35_66+V96_66+V103_66+V104_66+V141_66+V153_66+V170_66+V201_66+V207_66+V211_66+V215_66+V228_66+V238_66+V250_66+V310_66+V321_66)/19)*100                      //alternative progressive score in 66th congress [for robustness]           
*The variables below are defined only for party codes 200 and 100 and stay missing for any other value of pty66.
*[judging by the rep/dem variable names, 200 = Republican and 100 = Democrat]
gen dinc2018 = rep20 - rep18 if pty66==200                   //change in incumbent support between 1920 and 1918
replace dinc2018 = dem20 - dem18 if pty66==100
gen v66_20 = rep20 if pty66==200                             //vote share of the incumbent's party in 1920 [used for robustness and identification of `uncontested' counties]
replace v66_20 = dem20 if pty66==100
gen v66_18 = rep18 if pty66==200                             //vote share of the incumbent's party in 1918
replace v66_18 = dem18 if pty66==100

save "./MMC_1920.dta", replace


*****************************************************
*Adding data on incumbents 66th Congress
*****************************************************

*importing & cleaning data on incumbents
*Builds cand.dta: one row per district-year winner (1918-1922) with flags for whether the winner of the previous (incrun) or second previous (incrun2) election ran again for the same party.
*The statements below are order-dependent (sorting, xtset and lags precede the filtering to winners), so the sequence must be kept.
import excel "./MMC_raw.xlsx", sheet("h_candidates") firstrow clear                      //importing raw data
gen split = strpos(CANDIDATES_NAME, ",")                                                 //keeping only last names (text before the first comma), so that candidates can be matched across years [original data has inconsistent information on first names across years]
generate str1 nameshort = ""
replace nameshort = substr(CANDIDATES_NAME,1,split - 1)
drop split
egen id_can  = group(ICPSR_STATE_CODE CONGRESSIONAL_DIST_NO nameshort)                   //generating candidate id [state x district x last name]
egen id_canyr = group(ICPSR_STATE_CODE CONGRESSIONAL_DIST_NO nameshort YEAR_OF_ELECTION) //generating candidate-year id
by id_can (YEAR_OF_ELECTION), sort: gen incrun_01 = ICPSR_PARTY_CODE[_n]==ICPSR_PARTY_CODE[_n+1] if !missing(ICPSR_PARTY_CODE) & ELECTION_OUTCOME==1   //this identifies whether elected candidate (id_can) run in the next election for the same party [compares with the candidate's next record; 0 if there is no next record]
replace incrun_01 = . if YEAR_OF_ELECTION==1922                                          //data set ends in 1922, do not know whether 1922 candidate run in 1924
by id_can (YEAR_OF_ELECTION), sort: gen incrun2_01 = ICPSR_PARTY_CODE[_n]==ICPSR_PARTY_CODE[_n+2] if !missing(ICPSR_PARTY_CODE) & ELECTION_OUTCOME==1   //this identifies whether elected candidate (id_can) run in the second next election for the same party [compares with the candidate's second next record]
replace incrun2_01 = . if YEAR_OF_ELECTION==1922                                         //data set ends in 1922, do not know whether 1922 candidate run in 1926
replace incrun2_01 = . if YEAR_OF_ELECTION==1920                                         //data set ends in 1922, do not know whether 1920 candidate run in 1924
xtset id_can YEAR_OF_ELECTION                                                            //xtsetting data such that L. (lags) can be used
gen incrun_01L = L2.incrun_01                                                            //value of incrun_01 two years earlier: 1 indicates that incumbent run in the last election AND run for the same party; 0 indicates that candidate run in the last election BUT for a different party
gen incrun2_01L = L4.incrun2_01                                                          //value of incrun2_01 four years earlier: 1 indicates that incumbent run in the second last election AND run for the same party 
egen id_distyr = group(ICPSR_STATE_CODE CONGRESSIONAL_DIST_NO YEAR_OF_ELECTION)          //generating district-year id
egen incrun = max(incrun_01L), by(id_distyr)                                             //using only the maximum value (1) in each district-year [this allows keeping only one entry per district-year - see below]
egen incrun2 = max(incrun2_01L), by(id_distyr)                                           //using only the maximum value (1) in each district-year [this allows keeping only one entry per district-year - see below]
keep if ELECTION_OUTCOME==1                                                              //keeping only winners in each district-year [one entry per district-year]
drop if YEAR_OF_ELECTION==1916                                                           //1916 was needed to compute values for 1918
replace CONGRESSIONAL_DIST_NO=1 if CONGRESSIONAL_DIST_NO==98 & ICPSR_STATE_CODE==66      //spotted different coding of states with a single at-large district [preparing for merger with master]
replace CONGRESSIONAL_DIST_NO=1 if CONGRESSIONAL_DIST_NO==98 & ICPSR_STATE_CODE==11      //spotted different coding of states with a single at-large district [preparing for merger with master]
replace CONGRESSIONAL_DIST_NO=1 if CONGRESSIONAL_DIST_NO==98 & ICPSR_STATE_CODE==68      //spotted different coding of states with a single at-large district [preparing for merger with master]
replace CONGRESSIONAL_DIST_NO=1 if CONGRESSIONAL_DIST_NO==98 & ICPSR_STATE_CODE==65      //spotted different coding of states with a single at-large district [preparing for merger with master]
replace CONGRESSIONAL_DIST_NO=1 if CONGRESSIONAL_DIST_NO==98 & ICPSR_STATE_CODE==61      //spotted different coding of states with a single at-large district [preparing for merger with master]
rename ICPSR_STATE_CODE state                                                            //preparing for merger with master
rename YEAR_OF_ELECTION   year
gen cd20 = CONGRESSIONAL_DIST_NO if year==1920                                           //year-specific district keys; each is missing for rows of other years
gen cd22 = CONGRESSIONAL_DIST_NO if year==1922 
gen cd18 = CONGRESSIONAL_DIST_NO if year==1918
 
save "./cand.dta", replace                                                               //-replace- keeps the do-file re-runnable

*extracting relevant data for MMC_1920
*[no preserve/restore is needed here: cand.dta is already on disk and the master is loaded with -use, clear- right after the extract is saved]
use "./cand.dta", clear
*keeping 1920 winners in sample states
keep if inlist(state, 1, 2, 3, 4, 5, 6, 11, 12, 14, 21, 22, 23, 24, 25, 31, 33, 34, 35, 36, 37, 40, 41, 42, 43, 45, 47, 48, 49, 51, 52, 53, 54, 56, 66)
keep if year==1920
keep cd20  incrun  state 
rename incrun incrun20                                       //1 if the winner of the previous election ran again in 1920 for the same party
save "./cand66.dta", replace                                 //-replace- keeps the do-file re-runnable

*merging with master
use "./MMC_1920.dta", clear
merge m:1 state cd20 using "./cand66.dta"  //61 districts in cand66.dta not matched; 110 counties in MMC_1920.dta not matched [these observations contain districts that do not have any counties that map into district boundaries.]
drop if _merge==2                                            //dropping unmatched districts [see note above] 
drop _merge	
	
save "./MMC_1920.dta" , replace


*****************************************************
*Adding census data
*****************************************************

*importing & cleaning census data
import excel "./MMC_raw.xlsx", sheet("census") firstrow clear 

*keeping states in the sample (observations with a missing state code are dropped as well)
keep if inlist(state, 1, 2, 3, 4, 5, 6, 11, 12, 14, 21, 22, 23, 24, 25, 31, 33, 34, 35, 36, 37, 40, 41, 42, 43, 45, 47, 48, 49, 51, 52, 53, 54, 56, 66)

label variable f21 "Female population 21+"
label variable m21 "Male population 21+"
label variable negm21 "Negro males 21+"
label variable negf21 "Negro females 21+"
label variable fillit21 "Illiterate females 21+"
label variable millit21 "Illiterate males 21+"
label variable fbnawf21 "Foreign naturalized white females 21+"
label variable fbnawm21 "Foreign naturalized white males 21+"
label variable fbalwf21 "Foreign alien white females 21+"
label variable fbalwm21 "Foreign alien white males 21+"
label variable fbfpwf21 "Foreign white females/first papers 21+"
label variable fbfpwm21 "Foreign white males/first papers 21+"
label variable totpop "Total Population 1920"
label variable urb25 "Population cities 25K+ 1920"
label variable mfgout "Value manufacturing output"
label variable region2 "U.S. Census Region (5)"

*generating variables (shares are in percent of the adult 21+ population unless noted)
generate awomen20 = (f21/(m21+f21))*100                     //proportion of adult women      
generate ablack20 =((negm21+negf21)/(m21+f21))*100          //proportion of adult black
generate urb20 = (urb25/totpop)*100                         //proportion urban [share of total population living in cities of 25K+]
generate mfo20_m = mfgout/1000000                           //manufacturing output in millions
generate anat20 = ((fbnawf21+fbnawm21)/(f21+m21))*100       //proportion of naturalized

save "./census.dta", replace                                //-replace- keeps the do-file re-runnable

*merging with master
use "./MMC_1920.dta", clear
merge 1:1 county using "./census.dta"                       //county is the only merge key, so it must be unique across states in both files
drop _merge

save "./MMC_1920.dta" , replace


*****************************************************
*Adding data on dry counties in 1918
*****************************************************

*importing & cleaning data
*The dry sheet identifies counties by a short county code (ICPSRCNTY) plus an ambiguity counter (AMBIGCHK); both are concatenated into the merge key tempmerge.
import excel "./MMC_raw.xlsx", sheet("dry") firstrow clear 
replace ICPSRCNTY = subinstr(ICPSRCNTY," ","",.)               //removing empty spaces
egen tempmerge=concat(ICPSRCNTY AMBIGCHK)                      //generating unique identifier for merge             
*keeping states in the sample (observations with a missing state code are dropped as well)
keep if inlist(ICPSRSTATE, 1, 2, 3, 4, 5, 6, 11, 12, 14, 21, 22, 23, 24, 25, 31, 33, 34, 35, 36, 37, 40, 41, 42, 43, 45, 47, 48, 49, 51, 52, 53, 54, 56, 66)
rename ICPSRSTATE state                                        //preparing for merge
save "./dry.dta", replace                                      //-replace- keeps the do-file re-runnable

*merging with master
*The same key is rebuilt in the master: first four characters of the county name (spaces removed) plus an alphabetical rank among same-state counties sharing that code (0 when the code is unique).
use "./MMC_1920.dta", clear
gen ICPSRCNTY = substr(county_name, 1, 4)                      //preparing county identifier for merger
replace ICPSRCNTY = subinstr(ICPSRCNTY," ","",.) 
replace ICPSRCNTY = "OBRI" if state==31 & ICPSRCNTY=="OBR"     //enabling this county to be merged [correcting for different spelling in using and master]
duplicates tag state ICPSRCNTY, gen(amb)                       //identifies counties in one state with same ICPSRCNTY 
replace amb = 1 if amb>1                             
egen ambid = group(ICPSRCNTY amb state)  if amb==1             //identifies counties in one state with same ICPSRCNTY with a unique identifier 
bysort ambid (county_name): gen amb_rank = [_n] if amb==1      //sorting alphabetically within unique identifiers
replace amb_rank=0 if amb_rank==.
egen tempmerge=concat(ICPSRCNTY amb_rank)                      //creates identifier for merger
merge 1:1 state tempmerge using "./dry.dta"                    //matching 1920 counties with 1918 counties; 13 counties not matched in dry.dta [missing]; 40 counties not matched in MMC_1920, 39 of which corrected below [these are 1918 counties that did not exist in 1920 or missing]
drop if _merge==2                                              //dropping observations that are not in master 
drop _merge  ICPSRCNTY amb ambid amb_rank tempmerge AMBIGCHK                                 
replace PROH1918 = 9 if PROH1918==. & state==22                //replacing known missing values if entire state was dry
replace PROH1918 = 9 if PROH1918==. & state==37 
replace PROH1918 = 9 if PROH1918==. & state==40 

*generating variables
gen proh18_01= (PROH1918>0) if !missing(PROH1918)              //dummy: 1 if PROH1918 is positive, 0 if it is zero, missing if PROH1918 is missing

save "./MMC_1920.dta" , replace


*****************************************************
*Adding data on women's employment
*****************************************************

*importing data
import excel "./MMC_raw.xlsx", sheet("employed") firstrow  clear 

*generating variables
generate fempl20 = (fwork_abs /(mwork_abs + fwork_abs))*100  //women's share of the labour force, in percent
save "./empl.dta", replace                                   //-replace- keeps the do-file re-runnable

*merging with master [state-level values are attached to every county of the state]
use "./MMC_1920.dta", clear
merge m:1 state using "./empl.dta"
drop _merge

save "./MMC_1920.dta" , replace


*****************************************************
*Adding data on redistricting
*****************************************************

*importing data
*[the sheet is read without a header row: cells A2:C43 are loaded into the three variables named in the command]
import excel state_name county_name_r redist2018 using "./MMC_raw.xlsx", sheet("redistricting") cellrange(A2:C43)  clear 
save "./redist2018.dta", replace                             //-replace- keeps the do-file re-runnable

*merging with master
use "./MMC_1920.dta", clear
generate county_name_r = county_name                         //enabling merger [renaming county_names that have the same name in the same state]
replace county_name_r = "VERMILLION2" if county==451150
replace county_name_r = "POINTE COUPEE 2" if county==450790
merge 1:1 state_name county_name_r using "./redist2018.dta"  //[42 counties were matched, these refer to redistricted between 1920 and 1918] 
drop _merge county_name_r

*cleaning data
replace redist2018=0 if redist2018==.                        //counties absent from the redistricting sheet are coded 0
       
save "./MMC_1920.dta" , replace


*****************************************************
*Adding data on voter restrictions
*****************************************************

*importing data
import excel "./MMC_raw.xlsx", sheet("restrictions") firstrow  clear 
save "./restr.dta", replace                                  //-replace- keeps the do-file re-runnable

*merging with master [state-level restriction indicators are attached to every county of the state]
use "./MMC_1920.dta", clear
merge m:1 state using "./restr.dta"
drop _merge

*generating variables
*Women's share (in percent) of the adult population after removing a restricted group from both numerator and denominator.
generate awomen20_lit = ((f21-fillit21)/(m21+f21-millit21-fillit21) )*100                           //adult literate  women
generate awomen20_nb  = ((f21-negf21)/(f21+m21-negf21-negm21))*100                                  //adult non-black women
generate awomen20_na = ((f21-fbalwf21-fbfpwf21)/(f21+m21-fbalwf21-fbalwm21-fbfpwf21-fbfpwm21))*100  //adult non-alien women [excludes all aliens]
generate awomen20_naf = ((f21-fbalwf21)/(f21+m21-fbalwf21-fbalwm21))*100                            //adult non-alien women [does not exclude declarant aliens]

*Restriction-adjusted versions: the adjusted share where the state indicator equals 1, the baseline share otherwise (including when the indicator is missing).
generate awomen20_litl = cond(litt==1, awomen20_lit, awomen20)      //adult literate women if literacy tests
generate awomen20_nbp = cond(pollt==1, awomen20_nb, awomen20)       //adult non-black women if poll taxes
generate awomen20_nbr = cond(registr==1, awomen20_nb, awomen20)     //adult non-black women if restrictive registration
generate awomen20_naa = cond(alien==1, awomen20_na, awomen20_naf)   //adult non-alien women: excludes declarant aliens if declarant aliens without suffrage, otherwise does not exclude them

save "./MMC_1920.dta" , replace


*****************************************************
*Adding data on suffragists
*****************************************************

*importing data
import excel "./MMC_raw.xlsx", sheet("suffragists") firstrow  clear 
save "./suff.dta", replace                                   //-replace- keeps the do-file re-runnable

*merging with master [state-level values are attached to every county of the state]
use "./MMC_1920.dta", clear
merge m:1 state using "./suff.dta"
drop _merge
             
save "./MMC_1920.dta" , replace


*****************************************************
*Adding data on suffrage movement strength
*****************************************************

*extracting and cleaning data from MMC_nawsa.dta [created above]
*[no preserve/restore is needed here: MMC_nawsa.dta stays untouched on disk and the master is loaded with -use, clear- right after the extract is saved]
use "./MMC_nawsa.dta", clear                                          //[created above]
keep if sample==1                                                        
tsset  state yr
tsfill, full                                                          //creating balanced time-series
bys state: replace state_name = state_name[_n-1] if state_name==""    //adding state labels to new year entries
replace membpc = L.membpc if membpc==.                                //if membpc missing, replacing with the last value known [NOTE(review): confirm that runs of several consecutive missing years are filled as intended]
keep if yr==1920                                                      //keeping 1920 values [now contains values that refer to the year before suffrage adoption] 
keep state state_name membpc                                          //preparing for merger with MMC_1920.dta
save "./nawsa20.dta", replace                                         //-replace- keeps the do-file re-runnable

*merging with master [state-level membership is attached to every county of the state]
use "./MMC_1920.dta", clear
merge m:1 state using "./nawsa20.dta"
drop _merge
             
save "./MMC_1920.dta", replace


*****************************************************
*Data extraction for 1920 data set
*****************************************************

*Restricting the merged county file to the estimation sample. Missing values count as larger than any number in Stata, so each ">" or ">=" condition below also removes observations with a missing value.
drop if cd20>=52                                   //counties that do not map into congressional districts in 1920
drop if redist2018==1 | redistcd2018==1            //counties which changed boundaries between 1918 and 1920, or moved from one district to another
drop if (pty66>200 & pty66<.) | pty66==.           //third party incumbents, and districts with missing `incumbent party' [incumbent did not serve full term; see MMC_build.do for further details]
drop if incrun20!=1                                //only districts where incumbent run AND run for the same party remain
drop if v66_20>99 | v66_18>99                      //`uncontested' counties in 1920 or 1918 [`99' cutoff-point accounts for scattering]


*****************************************************
*Data preparation for 1920 data set
*****************************************************

*generating centered variables for interaction: each new variable carries the source name plus the suffix x
foreach v in awomen20 awomen20_litl awomen20_nbp awomen20_nbr awomen20_naa progsc66 progsc66I progsc66P {
	center `v', generate(`v'x)
}

*generating terciles of progressive score for Figure 5.
*Cut points are the 0th, 33rd, 66th and 100th centiles; a value equal to a cut point falls into the lower group.
centile progsc66, centile(0 33 66 100)
return list
local pmin `r(c_1)'
local cut1 `r(c_2)'
local cut2 `r(c_3)'
local pmax `r(c_4)'
generate progsc66_3 = 1 if progsc66>=`pmin' & progsc66<=`cut1'
replace  progsc66_3 = 2 if progsc66>`cut1' & progsc66<=`cut2'
replace  progsc66_3 = 3 if progsc66>`cut2' & progsc66<=`pmax'
label define cat3 1 "Low" 2 "Med" 3 "High"
label values progsc66_3 cat3

*generating cutoffs for movement strength 
*For the 60th, 40th and 20th percentile of membpc in turn: nawsa_P is 0 at or below the cutoff, 1 above it, and missing when membpc is missing.
foreach p in 60 40 20 {
	centile membpc, centile(0 `p' 100)
	return list
	generate nawsa_`p' = .
	replace nawsa_`p' = 0 if membpc>=`r(c_1)' & membpc<=`r(c_2)'
	replace nawsa_`p' = 1 if membpc>`r(c_2)' & membpc<=`r(c_3)'
}

*generating dummies for each state: s1-s56, one per integer code from 1 to 56 (codes absent from the data give all-zero columns)
forvalues code = 1/56 {
	generate s`code' = (state==`code')
}

save "./MMC_1920.dta" , replace //this is the final version of MMC_1920 used for main analyses




*****************************************************
*****************************************************
**             CREATING 1918 DATA SET              **
*****************************************************
*****************************************************

*****************************************************
*Creating Stata data set with election data 
*****************************************************

*importing & cleaning data on elections
import excel "./MMC_raw.xlsx", sheet("h_election") firstrow clear 

*keeping states in the sample only (observations with a missing state code are dropped as well)
keep if inlist(state, 1, 2, 3, 4, 5, 6, 11, 12, 14, 21, 22, 23, 24, 25, 31, 33, 34, 35, 36, 37, 40, 41, 42, 43, 45, 47, 48, 49, 51, 52, 53, 54, 56, 66)
label variable state  "ICPSR state code"
label variable county "ICPSR county code"
keep state state_name county county_name cd16 rep16 dem16 turn16 cd18 rep18 dem18 turn18 cd20 //keeping only relevant variables
replace cd16=1 if cd16==98 & state==66                 //spotted different coding of states with a single at-large district [preparing for merger with bills65.dta, generated below]
replace cd18=1 if cd18==98 & state==66                 //spotted different coding of states with a single at-large district [preparing for merger with bills65.dta, generated below]

*generating variables
egen cd18_u = group(state cd18)                        //creating unique district identifiers
generate redistcd1816 = (cd18 != cd16)                 //dummy for counties whose district number differs between 1918 and 1916 [same result as flagging cd18>cd16 or cd18<cd16]
generate dturn1816 = turn18-turn16                     //change in turnout between 1918 and 1916
foreach var of varlist rep16 dem16 {                   //copies of the 1916 vote shares with missing values set to 0, used only for the margin below
generate `var'_0 = `var'
replace `var'_0 = 0 if `var'_0==.
}
generate margin16 = round(abs(dem16_0-rep16_0), 0.1)   //county-level margin between Republican and Democratic candidates in 1916, rounded to 0.1

*-replace- lets the do-file be re-run without first deleting the output by hand
save "./MMC_1918.dta", replace

*creating dummy variables for Rep/Dem entry/withdrawal between 1918 & 1916
use "./MMC_1918.dta", clear
preserve                                               //temporarily preserve to collapse at district level
*keeping states in the sample only
keep if inlist(state, 1, 2, 3, 4, 5, 6, 11, 12, 14, 21, 22, 23, 24, 25, 31, 33, 34, 35, 36, 37, 40, 41, 42, 43, 45, 47, 48, 49, 51, 52, 53, 54, 56, 66)
collapse (mean) rep18 dem18 rep16 dem16, by(cd18_u)

*generating dummies for candidacy: 1 if the district mean vote share is positive, 0 if it is zero or missing
foreach v in rep16 dem16 rep18 dem18 {
	generate c`v' = (`v'>0) & !missing(`v')
}
generate dcr1816 = crep18-crep16                       //generating change in candidacy: -1 = withdrawal, 0 = no change, 1 = entry
generate dcd1816 = cdem18-cdem16                           
generate wrd18 = (dcd1816==-1) | (dcr1816==-1)         //generating withdrawal dummy [Rep or Dem candidate in 1916 but not in 1918]
generate nrd18 = (dcd1816==1) | (dcr1816==1)           //generating entry dummy [Rep or Dem candidate in 1918 but not in 1916]
keep cd18_u wrd18 nrd18                                //creating data set to be merged with MMC_1918
save "./cd18.dta", replace                             //-replace- keeps the do-file re-runnable
restore                                                //restoring to pre-collapsed data set at county level

*merging with master
use "./MMC_1918.dta", clear
merge m:1 cd18_u using "./cd18.dta"
drop _merge

save "./MMC_1918.dta" , replace


*****************************************************
*Adding votes on progressive bills in 65th Congress
*****************************************************

*importing & cleaning data on bills and incumbent party
import excel "./MMC_raw.xlsx", sheet("h_bills 65th") firstrow clear  //importing raw data
foreach var of varlist V22 - V228 {    //dropping districts where incumbent was a non-member on at least one of the bills, i.e. excluding incumbents who did not serve full term
	drop if `var' == 0
}
recode V22-V228 (2 3 4 5 6 7 8 9 = 0) //codes 2-9 set to 0 so that only code 1 [yea vote cast in person] counts
rename dist cd18                      //preparing for merger: to distinguish between districts of different congresses
rename party pty65                    //preparing for merger: to distinguish between incumbents of different congresses
foreach var of varlist V22 - V228 {   //preparing for merger: to distinguish between bills from different congresses
	rename `var' `var'_65
}
save "./bills65.dta", replace         //replace added so that the build can be re-run

*merging with master
*57 districts in bills65.dta not matched [missing in h_election]; 326 counties in MMC_1918.dta not matched
*[counties that do not map into district boundaries, i.e. have no single assigned district in h_election,
* or counties in districts excluded above because the incumbent did not serve a full term]
use "./MMC_1918.dta", clear
merge m:1 state cd18 using "./bills65.dta"
drop if _merge==2                                             //dropping unmatched districts [see note above]
drop _merge

*cleaning data
label var pty65 "ICPSR party code 65th Congress"

*generating variables
*progressive score: percentage of yea votes on the 16 listed bills [Appendix Table 4 and "progressive" sheet in MMC_raw.xls]
gen progsc65 = ((V22_65+V57_65+V61_65+V67_65+V69_65+V88_65+V93_65+V137_65 ///
	+V151_65+V158_65+V160_65+V180_65+V200_65+V201_65+V223_65+V228_65)/16)*100
gen dinc1816 = rep18 - rep16 if pty65==200                   //change in incumbent support between 1918 and 1916
replace dinc1816 = dem18 - dem16 if pty65==100
gen v65_18 = rep18 if pty65==200                             //vote of the incumbent's party [used for robustness and identification of "uncontested" counties]
replace v65_18 = dem18 if pty65==100
gen v65_16 = rep16 if pty65==200
replace v65_16 = dem16 if pty65==100

save "./MMC_1918.dta", replace


*****************************************************
*Adding votes on progressive bills in 66th Congress - to replicate figures A13 & A14
*****************************************************

*merging bills66.dta [created above] into the master
*55 districts in bills66.dta not matched [missing in h_election]; 269 counties in MMC_1918.dta not matched
*[counties that do not map into district boundaries, or counties in districts excluded above
* because the incumbent did not serve a full term]
use "./MMC_1918.dta"
merge m:1 state cd20 using "./bills66.dta"
drop if _merge==2                                             //using-only districts are discarded
drop _merge

*cleaning data
label variable pty66 "ICPSR party code 66th Congress"

*generating variables
*progressive score: percentage of yea votes on the 22 listed bills [Appendix Table 3 and "progressive" sheet in MMC_raw.xls]
gen progsc66 = ((V2_66+V17_66+V33_66+V35_66+V41_66+V43_66+V96_66+V103_66 ///
	+V104_66+V112_66+V141_66+V153_66+V170_66+V201_66+V207_66+V211_66 ///
	+V215_66+V228_66+V238_66+V250_66+V310_66+V321_66)/22)*100

save "./MMC_1918.dta", replace


*****************************************************
*Adding data on incumbents 65th Congress
*****************************************************

*extracting relevant data from cand.dta [created above] for MMC_1918.dta
*The subset is saved as cand65.dta and the master is loaded afterwards, so no preserve/restore is needed.
use "./cand.dta", clear
*keeping states in the sample [same set of state codes as before; repeated codes listed once]
keep if inlist(state, 1, 2, 3, 4, 5, 6, 11, 12, 14, 21, 22, 23, 24, 25, 31, 33, 34, ///
	35, 36, 37, 40, 41, 42, 43, 45, 47, 48, 49, 51, 52, 53, 54, 56, 66)
keep if year==1918
keep cd18  incrun  state
rename incrun incrun18
save "./cand65.dta", replace                                 //replace added so that the build can be re-run

*merging with master
use "./MMC_1918.dta", clear
merge m:1 state cd18 using "./cand65.dta"                    //61 districts in cand65.dta not matched [missing in h_election]; 110 counties in MMC_1918.dta not matched [these refer to counties that do not map into district boundaries].
drop if _merge==2                                            //dropping unmatched districts [see note above]
drop _merge

save "./MMC_1918.dta" , replace


*****************************************************
*Adding census data
*****************************************************

*importing & cleaning census data
import excel "./MMC_raw.xlsx", sheet("census") firstrow clear
keep state county f21 m21 negm21 negf21 totpop urb25         //keeping only relevant variables
*keeping states in the sample [same set of state codes as before; repeated codes listed once]
keep if inlist(state, 1, 2, 3, 4, 5, 6, 11, 12, 14, 21, 22, 23, 24, 25, 31, 33, 34, ///
	35, 36, 37, 40, 41, 42, 43, 45, 47, 48, 49, 51, 52, 53, 54, 56, 66)
label var f21 "Female population 21+"
label var m21 "Male population 21+"
label var negm21 "Negro males 21+"
label var negf21 "Negro females 21+"
label var totpop "Total Population 1920"
label var urb25 "Population cities 25K+ 1920"

*generating variables
gen awomen20 = (f21/(m21+f21))*100                           //women as a percentage of the population aged 21+
gen ablack20 =((negm21+negf21)/(m21+f21))*100                //black adults as a percentage of the population aged 21+
gen urb20 = (urb25/totpop)*100                               //population of cities 25K+ as a percentage of total population

save "./census_short.dta", replace                           //replace added so that the build can be re-run

*merging with master
*census-only counties [if any] stay in the data at this point; _merge is not used to drop them here
use "./MMC_1918.dta", clear
merge 1:1 county using "./census_short.dta"
drop _merge

save "./MMC_1918.dta" , replace


*****************************************************
*Adding data on dry counties in 1918
*****************************************************

*merging with master cleaned dry.dta [created above]
*The merge key (state + tempmerge) is rebuilt here from county names; the steps below depend on their order.
use "./MMC_1918.dta" 
gen ICPSRCNTY = substr(county_name, 1, 4)                      //county identifier for the merger: first four characters of the county name
replace ICPSRCNTY = subinstr(ICPSRCNTY," ","",.)               //removing all blanks from the identifier
replace ICPSRCNTY = "OBRI" if state==31 & ICPSRCNTY=="OBR"     //enabling this county to be merged [correcting for different spelling in using and master]
duplicates tag state ICPSRCNTY, gen(amb)                       //amb = number of other counties in the same state with the same ICPSRCNTY
replace amb = 1 if amb>1                                       //turning the count into a 0/1 flag for ambiguous identifiers
egen ambid = group(ICPSRCNTY amb state)  if amb==1             //unique identifier for each set of counties sharing state and ICPSRCNTY
bysort ambid (county_name): gen amb_rank = [_n] if amb==1      //rank within each ambiguous set, in alphabetical order of county_name
replace amb_rank=0 if amb_rank==.                              //unambiguous counties get rank 0
egen tempmerge=concat(ICPSRCNTY amb_rank)                      //merge key: identifier followed by rank; presumably built the same way in dry.dta - confirm where dry.dta is created
merge 1:1 state tempmerge using "./dry.dta"                    //matching 1920 counties with 1918 counties; 13 counties not matched in dry.dta [missing]; 40 counties not matched in MMC_1918, 34 of which corrected below [these are 1918 counties that did not exist in 1920 or missing]
drop if _merge==2                                              //dropping observations that are not in master
drop _merge  ICPSRCNTY amb ambid amb_rank tempmerge AMBIGCHK   //removing helper variables; AMBIGCHK is not created in this section, presumably it comes from dry.dta                              
replace PROH1918 = 9 if PROH1918==. & state==22                //replacing known missing values if entire state was dry
replace PROH1918 = 9 if PROH1918==. & state==37 
replace PROH1918 = 9 if PROH1918==. & state==40 

*generating variables
gen proh18_01= (PROH1918>0) if !missing(PROH1918)              //1 if PROH1918 is positive, 0 otherwise; left missing where PROH1918 is missing

save "./MMC_1918.dta" , replace


*****************************************************
*Adding data on redistricting
*****************************************************

*importing data
*rows 68-134 of the redistricting sheet [67 counties] are read into three named columns
import excel state_name county_name_r redist1816 using "./MMC_raw.xlsx", sheet("redistricting") cellrange(A68:C134)  clear
save "./redist1816.dta", replace                                               //replace added so that the build can be re-run

*merging with master
use "./MMC_1918.dta", clear
gen county_name_r = county_name                                                //enabling merger [renaming county_names that are the same in the same states]
replace county_name_r = "VERMILLION2" if county==451150
replace county_name_r = "POINTE COUPEE 2" if county==450790
merge 1:1 state_name county_name_r using "./redist1816.dta"                    //[67 counties were matched, these were redistricted between 1916 and 1918]
drop _merge
merge 1:1 state_name county_name_r using "./redist2018.dta"                    //[redist2018.dta created above; 42 counties were matched, these were redistricted between 1918 and 1920]
drop _merge county_name_r

*cleaning data
replace redist1816=0 if redist1816==.                                          //counties absent from the redistricting lists are coded 0
replace redist2018=0 if redist2018==.

save "./MMC_1918.dta" , replace

*****************************************************
*Adding data on suffragists [for robustness]
*****************************************************

*adding the state-level suffragist data in suff.dta to the master [MMC_1918.dta, created above]
use "./MMC_1918.dta"
merge m:1 state using "./suff.dta", nogenerate          //no _merge variable is kept; unmatched rows from either side stay in the data

save "./MMC_1918.dta", replace


*****************************************************
*Adding data on suffrage movement strength
*****************************************************

*merging nawsa20.dta [created above] with master
use "./MMC_1918.dta", clear
merge m:1 state_name using "./nawsa20.dta"
drop _merge

*generating variables
*nawsa_60 = 1 if membpc>=0.65 [empirically derived value using MMC_1920.dta created above], 0 if below.
*Stata treats missing as larger than any number, so a plain "membpc>=0.65" is true for missing membpc;
*counties without membpc are therefore left missing here instead of being coded 1.
gen nawsa_60 = (membpc>=0.65) if !missing(membpc)

save "./MMC_1918.dta" , replace


*****************************************************
*Data extraction for 1918 data set
*****************************************************

*sample restrictions, applied to the data left in memory by the previous section
*geography: counties that do not map into a 1918 congressional district [cd18>=52, which also catches missing cd18],
*counties whose boundaries changed 1916-1918 or 1918-1920 [the latter so that 1920 census data map accurately into 1918 counties],
*and counties that moved from one district to another between 1916 and 1918
drop if cd18>=52 | redist1816==1 | redist2018==1 | redistcd1816==1
*incumbent party: third-party incumbents, and districts with system-missing incumbent party,
*which indicates an incumbent who did not serve a full term [see MMC_build.do for further details]
drop if (pty65>200 & pty65<.) | pty65==.
*keeping districts where the incumbent ran AND ran for the same party
drop if incrun18!=1
*"uncontested" counties in 1916 or 1918 [the 99 cutoff-point accounts for scattering]
drop if v65_16>99 | v65_18>99

*****************************************************
*Data preparation for 1918 data set
*****************************************************

*generating centered variables for interaction
*NOTE(review): center is a community-contributed command; it is not among the ssc install lines at the top of this file - confirm it is installed
center awomen20, generate(awomen20x)
center progsc65, generate(progsc65x)

*generating terciles of progressive score for Figure 5.
*cut-points are the 0th, 33rd, 66th and 100th centiles, returned as r(c_1)-r(c_4)
centile progsc65, centile(0 33 66 100)
return list
*1 = [c_1, c_2], 2 = (c_2, c_3], 3 = (c_3, c_4]; missing otherwise
gen progsc65_3 = cond(progsc65>`r(c_3)' & progsc65<=`r(c_4)', 3, ///
	cond(progsc65>`r(c_2)' & progsc65<=`r(c_3)', 2, ///
	cond(progsc65>=`r(c_1)' & progsc65<=`r(c_2)', 1, .)))
label values progsc65_3 cat3

save "./MMC_1918.dta", replace //this is the final version of MMC_1918 used for analyses

 

 
*****************************************************
*****************************************************
**             CREATING 1922 DATA SET              **
*****************************************************
*****************************************************

*****************************************************
*Creating Stata data set with election data 
*****************************************************

*importing & cleaning data on elections
*Builds the county-level base file MMC_1922.dta from the h_election sheet.
import excel "./MMC_raw.xlsx", sheet("h_election") firstrow clear
*keeping states in the sample only [same set of state codes as before; repeated codes 12, 14 and 34 listed once]
keep if inlist(state, 1, 2, 3, 4, 5, 6, 11, 12, 14, 21, 22, 23, 24, 25, 31, 33, 34, ///
	35, 36, 37, 40, 41, 42, 43, 45, 47, 48, 49, 51, 52, 53, 54, 56, 66)
label var state  "ICPSR state code"
label var county "ICPSR county code"
keep state state_name county county_name cd20 rep20 dem20 turn20 cd22 rep22 dem22 turn22 //keeping only relevant variables
replace cd20=1 if cd20==98 & state==66                 //spotted different coding of states with a single at-large district [preparing for merger with bills67.dta, generated below]
replace cd22=1 if cd22==98 & state==66                 //spotted different coding of states with a single at-large district [preparing for merger with bills67.dta, generated below]

*generating variables
egen cd22_u = group(state cd22)                        //creating unique district identifiers
gen redistcd2220 = (cd20!=cd22)                        //dummy: county changed districts between 1920 and 1922 [equivalent to cd20>cd22 | cd20<cd22]
gen dturn2220 = turn22-turn20                          //change in turnout between 1922 and 1920
foreach var of varlist rep20 dem20 {                   //copies of the 1920 vote shares with system-missing recoded to 0
	gen `var'_0 = cond(`var'==., 0, `var')
}
gen margin20 = round(abs(dem20_0-rep20_0), 0.1)        //county-level margin between Republican and Democratic candidates in 1920

*replace added so that the build can be re-run without aborting on an existing file
save "./MMC_1922.dta", replace

*creating dummy variables for Rep/Dem entry/withdrawal between 1922 & 1920
*The county file is collapsed to district level, the dummies are saved in cd22.dta,
*and the county file is then reloaded from disk [this replaces the earlier preserve/restore + reload].
use "./MMC_1922.dta", clear
*keeping states in the sample only [same set of state codes as before; repeated codes listed once]
keep if inlist(state, 1, 2, 3, 4, 5, 6, 11, 12, 14, 21, 22, 23, 24, 25, 31, 33, 34, ///
	35, 36, 37, 40, 41, 42, 43, 45, 47, 48, 49, 51, 52, 53, 54, 56, 66)
collapse (mean) rep22 dem22 rep20 dem20, by(cd22_u)   //district-level means of county vote shares
foreach v in rep20 dem20 rep22 dem22 {                //candidacy dummies crep20 cdem20 crep22 cdem22: 1 if mean share >0, 0 if zero or missing
	gen c`v' = (`v'>0 & !missing(`v'))
}
gen dcr2220 = crep22-crep20                           //generating change in candidacy
gen dcd2220 = cdem22-cdem20
gen wrd22 = (dcd2220==-1 | dcr2220==-1)               //withdrawal dummy: either party present in 1920 but not in 1922
gen nrd22 = (dcd2220==1 | dcr2220==1)                 //entry dummy: either party present in 1922 but not in 1920
keep cd22_u wrd22 nrd22                               //creating data set to be merged with MMC_1922
save "./cd22.dta", replace                            //replace added so that the build can be re-run

*merging with master
use "./MMC_1922.dta", clear
merge m:1 cd22_u using "./cd22.dta"
drop _merge

save "./MMC_1922.dta" , replace


*****************************************************
*Adding votes on progressive bills in 67th Congress
*****************************************************

*importing & cleaning data on bills and incumbent party
import excel "./MMC_raw.xlsx", sheet("h_bills 67th") firstrow clear  //importing raw data
foreach var of varlist V21 - V335 {                    //dropping districts where incumbent was a non-member on at least one of the bills, i.e. excluding incumbents who did not serve full term
	drop if `var' == 0
}
recode V21-V335 (2 3 4 5 6 7 8 9 = 0)                  //codes 2-9 set to 0 so that only code 1 [yea vote cast in person] counts
rename dist cd22                                       //preparing for merger: to distinguish between districts of different congresses
rename party pty67                                     //preparing for merger: to distinguish between incumbents of different congresses
foreach var of varlist V21 - V335 {                    //preparing for merger: to distinguish between bills from different congresses
	rename `var' `var'_67
}
save "./bills67.dta", replace                          //replace added so that the build can be re-run

*merging with master
*55 districts in bills67.dta not matched [missing in h_election]; 324 counties in MMC_1922.dta not matched
*[counties that do not map into district boundaries, i.e. have no single assigned district in h_election,
* or counties in districts excluded above because the incumbent did not serve a full term]
use "./MMC_1922.dta", clear
merge m:1 state cd22 using "./bills67.dta"
drop if _merge==2                                            //dropping unmatched districts [see note above]
drop _merge

*cleaning data
label var pty67 "ICPSR party code 67th Congress"

*generating variables
*progressive score: percentage of yea votes on the 22 listed bills [Appendix Table 5 and "progressive" sheet in MMC_raw.xls]
gen progsc67 = ((V21_67+V32_67+V36_67+V37_67+V53_67+V80_67+V89_67+V132_67 ///
	+V169_67+V174_67+V194_67+V197_67+V198_67+V200_67+V225_67+V237_67 ///
	+V247_67+V255_67+V296_67+V305_67+V323_67+V335_67)/22)*100
gen dinc2220 = rep22 - rep20 if pty67==200                  //change in incumbent support between 1922 and 1920
replace dinc2220 = dem22 - dem20 if pty67==100
gen v67_22 = rep22 if pty67==200                            //vote of the incumbent's party [used for robustness and identification of "uncontested" counties]
replace v67_22 = dem22 if pty67==100
gen v67_20 = rep20 if pty67==200
replace v67_20 = dem20 if pty67==100

save "./MMC_1922.dta", replace


*****************************************************
*Adding data on incumbents 67th Congress
*****************************************************

*extracting relevant data from cand.dta [data set created above] for MMC_1922.dta
*The subset is saved as cand67.dta and the master is loaded afterwards, so no preserve/restore is needed.
use "./cand.dta", clear
*keeping states in the sample [same set of state codes as before; repeated codes listed once]
keep if inlist(state, 1, 2, 3, 4, 5, 6, 11, 12, 14, 21, 22, 23, 24, 25, 31, 33, 34, ///
	35, 36, 37, 40, 41, 42, 43, 45, 47, 48, 49, 51, 52, 53, 54, 56, 66)
keep if year==1922
keep cd22  incrun  state
rename incrun incrun22
save "./cand67.dta", replace                                 //replace added so that the build can be re-run

*merging with master
use "./MMC_1922.dta", clear
merge m:1 state cd22 using "./cand67.dta"                    //64 districts in cand67.dta not matched [missing in h_election]; 110 counties in MMC_1922.dta not matched [these refer to counties that do not map into district boundaries].
drop if _merge==2                                            //dropping unmatched districts [see note above]
drop _merge

save "./MMC_1922.dta" , replace


*****************************************************
*Adding census data
*****************************************************

*adding the cleaned census data in census_short.dta [created above] to the master, matched on the county code
use "./MMC_1922.dta"
merge 1:1 county using "./census_short.dta", nogenerate   //no _merge variable is kept; unmatched rows from either side stay in the data

save "./MMC_1922.dta", replace

*****************************************************
*Adding data on dry counties in 1918
*****************************************************

*merging with master cleaned dry.dta [created above]
*The merge key (state + tempmerge) is rebuilt here from county names; the steps below depend on their order.
use "./MMC_1922.dta"
gen ICPSRCNTY = substr(county_name, 1, 4)                      //county identifier for the merger: first four characters of the county name
replace ICPSRCNTY = subinstr(ICPSRCNTY," ","",.)               //removing all blanks from the identifier
replace ICPSRCNTY = "OBRI" if state==31 & ICPSRCNTY=="OBR"     //enabling this county to be merged [correcting for different spelling in using and master]
duplicates tag state ICPSRCNTY, gen(amb)                       //amb = number of other counties in the same state with the same ICPSRCNTY
replace amb = 1 if amb>1                                       //turning the count into a 0/1 flag for ambiguous identifiers
egen ambid = group(ICPSRCNTY amb state)  if amb==1             //unique identifier for each set of counties sharing state and ICPSRCNTY
bysort ambid (county_name): gen amb_rank = [_n] if amb==1      //rank within each ambiguous set, in alphabetical order of county_name
replace amb_rank=0 if amb_rank==.                              //unambiguous counties get rank 0
egen tempmerge=concat(ICPSRCNTY amb_rank)                      //merge key: identifier followed by rank; presumably built the same way in dry.dta - confirm where dry.dta is created
merge 1:1 state tempmerge using "./dry.dta"                    //matching 1920 counties with 1918 counties; 13 counties not matched in dry.dta [missing]; 40 counties not matched in MMC_1922, 39 of which corrected below [these are 1918 counties that did not exist in 1920 or missing]
drop if _merge==2                                              //dropping observations that are not in master
drop _merge  ICPSRCNTY amb ambid amb_rank tempmerge AMBIGCHK   //removing helper variables; AMBIGCHK is not created in this section, presumably it comes from dry.dta                              
replace PROH1918 = 9 if PROH1918==. & state==22                //replacing known missing values if entire state was dry
replace PROH1918 = 9 if PROH1918==. & state==37 
replace PROH1918 = 9 if PROH1918==. & state==40 

*generating variables
gen proh18_01= (PROH1918>0) if !missing(PROH1918)              //1 if PROH1918 is positive, 0 otherwise; left missing where PROH1918 is missing

save "./MMC_1922.dta" , replace


*****************************************************
*Adding data on redistricting
*****************************************************

*importing data
*rows 135-199 of the redistricting sheet [65 counties] are read into three named columns
import excel state_name county_name_r redist2220 using "./MMC_raw.xlsx", sheet("redistricting") cellrange(A135:C199)  clear
save "./redist2220.dta", replace                             //replace added so that the build can be re-run

*merging with master
use "./MMC_1922.dta", clear
gen county_name_r = county_name                              //enabling merger [renaming county_names that are the same in the same states]
replace county_name_r = "VERMILLION2" if county==451150
replace county_name_r = "POINTE COUPEE 2" if county==450790
merge 1:1 state_name county_name_r using "./redist2220.dta"  //[65 counties were matched, these were redistricted between 1920 and 1922]
drop _merge
merge 1:1 state_name county_name_r using "./redist2018.dta"  //[redist2018.dta created above; 42 counties were matched, these were redistricted between 1918 and 1920]
drop _merge county_name_r

*cleaning data
replace redist2220=0 if redist2220==.                        //counties absent from the redistricting lists are coded 0
replace redist2018=0 if redist2018==.

save "./MMC_1922.dta" , replace


*****************************************************
*Adding data on suffrage movement strength
*****************************************************

*merging with master cleaned nawsa20.dta [created above]
use "./MMC_1922.dta", clear
merge m:1 state_name using "./nawsa20.dta"
drop _merge

*generating variables
*nawsa_60 = 1 if membpc>=0.65 [empirically derived value using MMC_1920.dta created above], 0 if below.
*Stata treats missing as larger than any number, so a plain "membpc>=0.65" is true for missing membpc;
*counties without membpc are therefore left missing here instead of being coded 1.
gen nawsa_60 = (membpc>=0.65) if !missing(membpc)

save "./MMC_1922.dta" , replace

*****************************************************
*Data extraction for 1922 data set
*****************************************************

*sample restrictions, applied to the data left in memory by the previous section
*geography: counties that do not map into a 1922 congressional district [cd22>=52, which also catches missing cd22],
*counties whose boundaries changed 1920-1922 or 1918-1920 [the latter so that 1920 census data map accurately into 1922 counties],
*and counties that moved from one district to another between 1920 and 1922
drop if cd22>=52 | redist2220==1 | redist2018==1 | redistcd2220==1
*incumbent party: third-party incumbents, and districts with system-missing incumbent party,
*which indicates an incumbent who did not serve a full term [see MMC_build.do for further details]
drop if (pty67>200 & pty67<.) | pty67==.
*keeping districts where the incumbent ran AND ran for the same party
drop if incrun22!=1
*"uncontested" counties in 1922 or 1920 [the 99 cutoff-point accounts for scattering]
drop if v67_22>99 | v67_20>99

*****************************************************
*Data preparation for 1922 data set
*****************************************************

*generating centered variables for interaction
*NOTE(review): center is a community-contributed command; it is not among the ssc install lines at the top of this file - confirm it is installed
center awomen20, generate(awomen20x)
center progsc67, generate(progsc67x)

*generating terciles of progressive score for Figure 5.
*cut-points are the 0th, 33rd, 66th and 100th centiles, returned as r(c_1)-r(c_4)
centile progsc67, centile(0 33 66 100)
return list
*1 = [c_1, c_2], 2 = (c_2, c_3), 3 = [c_3, c_4]; missing otherwise
*a score exactly equal to c_3 is placed in tercile 3 [the "=" was moved to get more similar sample sizes across terciles]
gen progsc67_3 = cond(progsc67>=`r(c_3)' & progsc67<=`r(c_4)', 3, ///
	cond(progsc67>`r(c_2)' & progsc67<=`r(c_3)', 2, ///
	cond(progsc67>=`r(c_1)' & progsc67<=`r(c_2)', 1, .)))
label values progsc67_3 cat3

save "./MMC_1922.dta", replace //this is the final version of MMC_1922 used for analyses




*****************************************************
*****************************************************
**        CREATING 1922 LONG DATA SET              **
*****************************************************
*****************************************************


*****************************************************
*Creating Stata data set with election data 
*****************************************************

*importing & cleaning data on elections
*Builds the county-level base file MMC_1922long.dta [1922 vs 1918] from the h_election sheet.
import excel "./MMC_raw.xlsx", sheet("h_election") firstrow clear
*keeping states in the sample only [same set of state codes as before; repeated codes 12, 14 and 34 listed once]
keep if inlist(state, 1, 2, 3, 4, 5, 6, 11, 12, 14, 21, 22, 23, 24, 25, 31, 33, 34, ///
	35, 36, 37, 40, 41, 42, 43, 45, 47, 48, 49, 51, 52, 53, 54, 56, 66)
label var state  "ICPSR state code"
label var county "ICPSR county code"
keep state state_name county county_name cd22 rep22 dem22 turn22 cd18 rep18 dem18 turn18  //keeping only relevant variables
replace cd18=1 if cd18==98 & state==66                 //spotted different coding of states with a single at-large district [preparing for merger with bills67.dta]
replace cd22=1 if cd22==98 & state==66                 //spotted different coding of states with a single at-large district [preparing for merger with bills67.dta]

*generating variables
egen cd22_u = group(state cd22)                        //creating unique district identifiers
gen redistcd2218 = (cd18!=cd22)                        //dummy: county changed districts between 1918 and 1922 [equivalent to cd18>cd22 | cd18<cd22]
foreach var of varlist rep18 dem18 {                   //copies of the 1918 vote shares with system-missing recoded to 0
	gen `var'_0 = cond(`var'==., 0, `var')
}
gen margin18 = round(abs(dem18_0-rep18_0), 0.1)        //county-level margin between Republican and Democratic candidates in 1918

*replace added so that the build can be re-run without aborting on an existing file
save "./MMC_1922long.dta", replace

*creating dummy variables for Rep/Dem entry/withdrawal between 1922 & 1918
*The county file is collapsed to district level, the dummies are saved in cd22long.dta,
*and the county file is then reloaded from disk [this replaces the earlier preserve/restore + reload].
use "./MMC_1922long.dta", clear
*keeping states in the sample only [same set of state codes as before; repeated codes listed once]
keep if inlist(state, 1, 2, 3, 4, 5, 6, 11, 12, 14, 21, 22, 23, 24, 25, 31, 33, 34, ///
	35, 36, 37, 40, 41, 42, 43, 45, 47, 48, 49, 51, 52, 53, 54, 56, 66)
collapse (mean) rep18 dem18 rep22 dem22, by(cd22_u)    //district-level means of county vote shares
foreach v in rep22 dem22 rep18 dem18 {                 //candidacy dummies crep22 cdem22 crep18 cdem18: 1 if mean share >0, 0 if zero or missing
	gen c`v' = (`v'>0 & !missing(`v'))
}
gen dcr2218 = crep22-crep18                            //generating change in candidacy
gen dcd2218 = cdem22-cdem18
gen wrd2218 = (dcd2218==-1 | dcr2218==-1)              //withdrawal dummy: either party present in 1918 but not in 1922
gen nrd2218 = (dcd2218==1 | dcr2218==1)                //entry dummy: either party present in 1922 but not in 1918
keep cd22_u wrd2218 nrd2218                            //creating data set to be merged with MMC_1922long
save "./cd22long.dta", replace                         //replace added so that the build can be re-run

*merging with master
use "./MMC_1922long.dta", clear
merge m:1 cd22_u using "./cd22long.dta"
drop _merge

save "./MMC_1922long.dta", replace


*****************************************************
*Adding votes on progressive bills in 67th Congress
*****************************************************

*merging bills67.dta [created above] into the master
*55 districts in bills67.dta not matched [missing in h_election]; 324 counties in MMC_1922long.dta not matched
*[counties that do not map into district boundaries, i.e. have no single assigned district in h_election,
* or counties in districts excluded above because the incumbent did not serve a full term]
use "./MMC_1922long.dta"
merge m:1 state cd22 using "./bills67.dta"
drop if _merge==2                                             //using-only districts are discarded
drop _merge

*cleaning data
label variable pty67 "ICPSR party code 67th Congress"

*generating variables
*progressive score: percentage of yea votes on the 22 listed bills [Appendix Table 5 and "progressive" sheet in MMC_raw.xls]
gen progsc67 = ((V21_67+V32_67+V36_67+V37_67+V53_67+V80_67+V89_67+V132_67 ///
	+V169_67+V174_67+V194_67+V197_67+V198_67+V200_67+V225_67+V237_67 ///
	+V247_67+V255_67+V296_67+V305_67+V323_67+V335_67)/22)*100
*change in incumbent support between 1922 and 1918 [Republican incumbents coded 200, Democratic incumbents 100]
gen dinc2218 = cond(pty67==200, rep22 - rep18, cond(pty67==100, dem22 - dem18, .))
*vote of the incumbent's party in 1922 and 1918 [used for robustness and identification of "uncontested" counties]
gen v67_22 = cond(pty67==200, rep22, cond(pty67==100, dem22, .))
gen v67_18 = cond(pty67==200, rep18, cond(pty67==100, dem18, .))

save "./MMC_1922long.dta", replace


*****************************************************
*Adding data on incumbents 66th-67th Congress
*****************************************************

*extracting relevant data from cand.dta [data set created above] for MMC_1922long.dta
*The subset is saved as cand67long.dta and the master is loaded afterwards, so no preserve/restore is needed.
use "./cand.dta", clear
*keeping states in the sample [same set of state codes as before; repeated codes listed once]
keep if inlist(state, 1, 2, 3, 4, 5, 6, 11, 12, 14, 21, 22, 23, 24, 25, 31, 33, 34, ///
	35, 36, 37, 40, 41, 42, 43, 45, 47, 48, 49, 51, 52, 53, 54, 56, 66)
keep if year==1922
keep cd22  incrun2  state
rename incrun2 incrun22long
save "./cand67long.dta", replace                                   //replace added so that the build can be re-run

*merging with master
use "./MMC_1922long.dta", clear
merge m:1 state cd22 using "./cand67long.dta"                      //64 districts in cand67long.dta not matched [missing in h_election]; 110 counties in MMC_1922long.dta not matched [these refer to counties that do not map into district boundaries].
drop if _merge==2                                                  //dropping unmatched districts [see note above]
drop _merge

save "./MMC_1922long.dta" , replace


*****************************************************
*Adding census data
*****************************************************

*county-level 1:1 merge of the master with census_short.dta [created above]
use "./MMC_1922long.dta"
merge 1:1 county using "./census_short.dta", nogenerate    //nogenerate: _merge is never created, so no drop is required

save "./MMC_1922long.dta" , replace


*****************************************************
*Adding data on dry counties in 1918
*****************************************************

*merging master with cleaned dry.dta [created above]
*dry.dta is keyed on state + tempmerge; the same key is rebuilt here from county_name:
*first four characters of the name (blanks removed) followed by a rank that disambiguates
*counties of one state sharing the same four-character code (0 when the code is unique)
use "./MMC_1922long.dta"
gen ICPSRCNTY = substr(county_name, 1, 4)                      //preparing county identifier for merger: first 4 characters of county_name
replace ICPSRCNTY = subinstr(ICPSRCNTY," ","",.)               //removing blanks inside the 4-character code
replace ICPSRCNTY = "OBRI" if state==31 & ICPSRCNTY=="OBR"     //enabling this county to be merged [correcting for different spelling in using and master]
duplicates tag state ICPSRCNTY, gen(amb)                       //amb = number of OTHER counties in the same state with the same ICPSRCNTY [0 if unique]
replace amb = 1 if amb>1                                       //turning amb into a 0/1 flag for `ambiguous' codes
egen ambid = group(ICPSRCNTY amb state)  if amb==1             //unique identifier for each group of counties sharing a code within a state [missing if code is unique]
bysort ambid (county_name): gen amb_rank = [_n] if amb==1      //alphabetical rank of the county within its group of ambiguous codes
replace amb_rank=0 if amb_rank==.                              //unambiguous counties get rank 0
egen tempmerge=concat(ICPSRCNTY amb_rank)                      //merge key: 4-character code followed by rank
merge 1:1 state tempmerge using "./dry.dta"  //matching 1920 counties with 1918 counties; 13 counties not matched in dry.dta [missing]; 40 counties not matched in MMC_1922long, 39 of which corrected below [these are 1918 counties that did not exist in 1920 or missing]
drop if _merge==2                                              //dropping observations that are not in master
drop _merge  ICPSRCNTY amb ambid amb_rank tempmerge AMBIGCHK   //removing helper variables [AMBIGCHK presumably arrives from dry.dta - it is not created in this block]
replace PROH1918 = 9 if PROH1918==. & state==22                //replacing known missing values if entire state was dry
replace PROH1918 = 9 if PROH1918==. & state==37 
replace PROH1918 = 9 if PROH1918==. & state==40 

*generating variables
gen proh18_01= (PROH1918>0) if !missing(PROH1918)              //1 if PROH1918>0, 0 otherwise; stays missing where PROH1918 is missing

save "./MMC_1922long.dta" , replace


*****************************************************
*Adding data on redistricting
*****************************************************

*merging master with both redistricting files
use "./MMC_1922long.dta", clear
gen county_name_r = county_name                                   //merge key [renaming county_names that are the same in the same states]
replace county_name_r = "VERMILLION2" if county==451150
replace county_name_r = "POINTE COUPEE 2" if county==450790
*redist2018.dta: 42 counties matched [redistricted between 1920 and 1918]
*redist2220.dta: 65 counties matched [redistricted between 1922 and 1920]
foreach flag in redist2018 redist2220 {
    merge 1:1 state_name county_name_r using "./`flag'.dta", nogenerate
}
drop county_name_r

*cleaning data: counties without a record in a redistricting file are coded 0
foreach flag in redist2018 redist2220 {
    replace `flag' = 0 if `flag'==.
}

save "./MMC_1922long.dta" , replace


**********************************************
*Adding data on suffrage movement strength
*****************************************************

*merging with master cleaned nawsa20.dta [created above]
use "./MMC_1922long.dta", clear
merge m:1 state_name using "./nawsa20.dta"
drop _merge

*generating variables
*dummy for strong suffrage movement: 1 if membpc>=0.65, 0 if below [cutoff empirically derived using MMC_1920.dta created above]
*Stata treats missing values as larger than any number, so a bare `membpc>=0.65' would code
*observations with missing membpc as 1; they are left missing instead
gen nawsa_60 = (membpc>=0.65) if !missing(membpc)

save "./MMC_1922long.dta" , replace

*****************************************************
*Data extraction for 1922_long data set
*****************************************************

*Sample restrictions applied to the data in memory [the version just saved to disk is unrestricted].
*Note: Stata treats missing values as larger than any number, so every `>' / `>=' condition below
*also drops observations where the tested variable is missing.
drop if cd22>=52             //dropping counties that do not map into congressional districts in 1922 [also drops missing cd22]
drop if cd18>=52             //dropping counties that do not map into congressional districts in 1918 [also drops missing cd18]
drop if redist2220==1        //dropping counties which changed boundaries between 1922 and 1920
drop if redist2018==1        //dropping counties which changed boundaries between 1920 and 1918
drop if redistcd2218==1      //dropping counties that moved from one districts to another between 1922 and 1918 [redistcd2218 is created earlier in this file]
drop if pty67>200 & pty67<.  //dropping third party incumbents [codes above 200, excluding missing]
drop if pty67==.             //dropping district with missing data on `incumbent party', which indicates districts where an incumbent did not serve full term [see MMC_build.do for further details]
keep if incrun22long==1      //keeping  districts where incumbent run AND run for the same party [drops missing incrun22long as well]
drop if v67_22>99            //dropping `uncontested' counties in 1922 [`99' cutoff-point accounts for scattering]; also drops counties with missing v67_22
drop if v67_18>99            //dropping `uncontested' counties in 1918 [`99' cutoff-point accounts for scattering]; also drops counties with missing v67_18

*****************************************************
*Data preparation for 1922_long data set
*****************************************************

*generating centered variables for interaction
*`center' is a user-written command (SSC); install it only if it is not already available
capture which center
if _rc {
    ssc install center
}
center awomen20, generate(awomen20x)
center progsc67, generate(progsc67x)

save "./MMC_1922long.dta" , replace //this is the final version of MMC_1922long used for analyses




*****************************************************
*****************************************************
**        CREATING 1920 WEST DATA SET              **
*****************************************************
*****************************************************

*****************************************************
*Creating Stata data set with election data 
*****************************************************

*importing & cleaning data on elections
import excel "./MMC_raw.xlsx", sheet("h_election") firstrow clear
*keeping states in the sample for `west' data set [i.e. states mostly in the West where women could vote to Congress before the 1920 election]
keep if inlist(state, 13, 32, 61, 62, 63, 64, 65, 67, 68, 71, 72, 73)
label var state  "ICPSR state code"
label var county "ICPSR county code"
keep state state_name county county_name cd20 rep20 dem20 turn20 cd18 rep18 dem18 turn18  //keeping only relevant variables

*states 61, 65 and 68 have a single at-large district coded 98 here; recoding to 1
*[preparing for merger with bills66w.dta and cand66w.dta, generated below]
foreach yr in 20 18 {
    replace cd`yr' = 1 if cd`yr'==98 & inlist(state, 61, 65, 68)
}

*generating variables
egen cd20_u = group(state cd20)                        //creating unique district identifiers
egen cd18_u = group(state cd18)
gen redistcd2018 = (cd20 != cd18)                      //dummy: county's district number differs between 1920 and 1918
gen dturn2018 = turn20-turn18                          //change in turnout between 1920 and 1918
foreach var of varlist rep18 dem18 {                   //copies of the 1918 vote shares with missing set to 0, used only for the margin
    gen `var'_0 = `var'
    replace `var'_0 = 0 if `var'_0==.
}
gen margin18 = round(abs(dem18_0-rep18_0), 0.1)        //county-level margin between Republican and Democratic candidates in 1918

save "./MMC_1920west.dta", replace                     //replace: lets the do-file be re-run without aborting on an existing file

*creating dummy variables for Rep/Dem entry/withdrawal between 1920 & 1918
*[MMC_1920west.dta already contains only `west' sample states, so no further state filter is needed]
use "./MMC_1920west.dta", clear
preserve                                               //temporarily preserve to collapse at district level
collapse (mean) rep18 dem18 rep20 dem20, by(cd20_u)
foreach p in rep20 dem20 rep18 dem18 {                 //candidacy dummies crep20 cdem20 crep18 cdem18: 1 if district mean vote share is positive, 0 if zero or missing
    gen c`p' = (`p'>0 & !missing(`p'))
}
gen dcr2018 = crep20-crep18                            //change in candidacy: 1 = entry, -1 = withdrawal, 0 = no change
gen dcd2018 = cdem20-cdem18
gen wrd20 = (dcd2018==-1 | dcr2018==-1)                //withdrawal dummy: Rep or Dem candidate in 1918 but not in 1920
gen nrd20 = (dcd2018==1 | dcr2018==1)                  //entry dummy: Rep or Dem candidate in 1920 but not in 1918
keep cd20_u wrd20 nrd20                                //creating data set to be merged with MMC_1920west
save "./cd20w.dta", replace                            //replace: lets the do-file be re-run without aborting on an existing file
restore                                                //restoring to pre-collapsed data set at county level

*merging with master [the restored data in memory is MMC_1920west.dta as loaded above]
merge m:1 cd20_u using "./cd20w.dta"
drop _merge

save "./MMC_1920west.dta" , replace


*****************************************************
*Adding votes on progressive bills in 66th Congress
*****************************************************

*importing & cleaning data on bills and incumbent party
import excel "./MMC_raw.xlsx", sheet("h_bills 66th") firstrow clear  //importing raw data
*keeping states in the sample for `west' data set [i.e. states mostly in the West where women could vote to Congress before the 1920 election]
keep if inlist(state, 13, 32, 61, 62, 63, 64, 65, 67, 68, 71, 72, 73)
foreach var of varlist V2 - V321 {                            //dropping districts where incumbent was a non-member on at least one of the bills [i.e. excluding incumbents who did not serve full term]
    drop if `var' == 0
}
recode V2-V321 (2 3 4 5 6 7 8 9 = 0)                          //identifying `yea' votes casted in person: code 1 is kept, codes 2-9 become 0
rename dist cd20                                              //preparing for merger: to distinguish between districts of different congresses
rename party pty66                                            //preparing for merger: to distinguish between incumbents of different congresses
foreach var of varlist V2 - V321 {                            //preparing for merger: to distinguish between bills from different congresses
    rename `var' `var'_66
}
save "./bills66w.dta", replace                                //replace: lets the do-file be re-run without aborting on an existing file

*merging with master
*[26 districts in bills66w.dta not matched [missing in h_election]; 12 counties in MMC_1920west.dta not matched:
* these are counties that either do not map into district boundaries - do not have a single assigned district
* in h_election - OR belong to districts that were excluded above for having incumbents who did not `serve full terms']
use "./MMC_1920west.dta", clear
merge m:1 state cd20 using "./bills66w.dta"
drop if _merge==2                                             //dropping districts that appear only in bills66w.dta [see note above]
drop _merge

*cleaning data
label var pty66 "ICPSR party code 66th Congress"

*generating variables
*progressive score for 66th Congress: share (in %) of the 22 progressive bills with a `yea' vote
*[list of bills in Appendix Table 3 and `progressive' sheet in MMC_raw.xls]
gen progsc66 = ((V2_66+V17_66+V33_66+V35_66+V41_66+V43_66+V96_66+V103_66+V104_66+V112_66+V141_66+V153_66+V170_66+V201_66+V207_66+V211_66+V215_66+V228_66+V238_66+V250_66+V310_66+V321_66)/22)*100
gen dinc2018 = rep20 - rep18 if pty66==200                   //change in incumbent support between 1920 and 1918
replace dinc2018 = dem20 - dem18 if pty66==100
gen v66_20 = rep20 if pty66==200                             //vote share of the incumbent's party (Rep/Dem) [used for robustness and identification of `uncontested' counties]
replace v66_20 = dem20 if pty66==100
gen v66_18 = rep18 if pty66==200
replace v66_18 = dem18 if pty66==100

save "./MMC_1920west.dta", replace


*****************************************************
*Adding data on incumbents 66th Congress
*****************************************************

*extracting relevant data from cand.dta for MMC_1920west
*[no preserve/restore needed: the master is loaded from disk right after the extract is saved]
use "./cand.dta", clear
*keeping 1920 records for states in the `west' sample [i.e. states mostly in the West where women could vote to Congress before the 1920 election]
keep if year==1920 & inlist(state, 13, 32, 61, 62, 63, 64, 65, 67, 68, 71, 72, 73)
keep cd20  incrun  state
rename incrun incrun20
save "./cand66w.dta", replace                                  //replace: lets the do-file be re-run without aborting on an existing file

*merging with master
*[29 districts in cand66w.dta not matched [missing in h_election]; 9 counties in MMC_1920west.dta not matched
* [these refer to counties that do not map into district boundaries]]
use "./MMC_1920west.dta", clear
merge m:1 state cd20 using "./cand66w.dta"
drop if _merge==2                                              //dropping districts that appear only in cand66w.dta [see note above]
drop _merge

save "./MMC_1920west.dta" , replace


*****************************************************
*Adding census data
*****************************************************

*importing census data
import excel "./MMC_raw.xlsx", sheet("census") firstrow clear
keep state county f21 m21 negm21 negf21 totpop urb25         //keeping only relevant variables
*keeping states in the sample for `west' data set [i.e. states mostly in the West where women could vote to Congress before the 1920 election]
keep if inlist(state, 13, 32, 61, 62, 63, 64, 65, 67, 68, 71, 72, 73)
label var f21 "Female population 21+"
label var m21 "Male population 21+"
label var negm21 "Negro males 21+"
label var negf21 "Negro females 21+"
label var totpop "Total Population 1920"
label var urb25 "Population cities 25K+ 1920"

*generating variables
gen awomen20 = (f21/(m21+f21))*100                           //women as % of population aged 21+
gen ablack20 =((negm21+negf21)/(m21+f21))*100                //black population aged 21+ as % of population aged 21+
gen urb20 = (urb25/totpop)*100                               //population in cities of 25K+ as % of total population

save "./censusw.dta", replace                                //replace: lets the do-file be re-run without aborting on an existing file

*merging with master
use "./MMC_1920west.dta", clear
merge 1:1 county using "./censusw.dta"
drop _merge

save "./MMC_1920west.dta" , replace


*****************************************************
*Adding data on dry counties in 1918
*****************************************************

*importing & cleaning data
import excel "./MMC_raw.xlsx", sheet("dry") firstrow clear
replace ICPSRCNTY = subinstr(ICPSRCNTY," ","",.)               //removing empty spaces
egen tempmerge=concat(ICPSRCNTY AMBIGCHK)                      //generating unique identifier for merge
*keeping states in the sample for `west' data set [i.e. states mostly in the West where women could vote to Congress before the 1920 election]
keep if inlist(ICPSRSTATE, 13, 32, 61, 62, 63, 64, 65, 67, 68, 71, 72, 73)
rename ICPSRSTATE state                                        //preparing for merge
save "./dryw.dta", replace                                     //replace: lets the do-file be re-run without aborting on an existing file

*merging with master
*the merge key of dryw.dta (state + tempmerge) is rebuilt from county_name: first four characters of the
*name (blanks removed) followed by a rank that disambiguates counties of one state sharing the same code
use "./MMC_1920west.dta", clear
gen ICPSRCNTY = substr(county_name, 1, 4)                      //preparing county identifier for merger
replace ICPSRCNTY = subinstr(ICPSRCNTY," ","",.)
replace ICPSRCNTY = "OBRI" if state==31 & ICPSRCNTY=="OBR"     //spelling correction kept from the main data sets; state 31 is not among the west-sample codes kept above, so this is expected to change nothing here
duplicates tag state ICPSRCNTY, gen(amb)                       //amb = number of OTHER counties in the same state with the same ICPSRCNTY [0 if unique]
replace amb = 1 if amb>1                                       //turning amb into a 0/1 flag
egen ambid = group(ICPSRCNTY amb state)  if amb==1             //unique identifier for each group of counties sharing a code within a state
bysort ambid (county_name): gen amb_rank = _n if amb==1        //alphabetical rank within each group of ambiguous codes
replace amb_rank=0 if amb_rank==.                              //unambiguous counties get rank 0
egen tempmerge=concat(ICPSRCNTY amb_rank)                      //creates identifier for merger
merge 1:1 state tempmerge using "./dryw.dta"                   //matching 1920 counties with 1918 counties; 3 counties not matched in dry.dta [missing]; 42 counties not matched in MMC_1920 west, 1 of which corrected below [these are 1918 counties that did not exist in 1920 or missing]
drop if _merge==2                                              //dropping observations that are not in master
drop _merge  ICPSRCNTY amb ambid amb_rank tempmerge AMBIGCHK
replace PROH1918 = 9 if PROH1918==. & state==61                //replacing known missing values if entire state was dry

*generating variables
gen proh18_01= (PROH1918>0) if !missing(PROH1918)              //1 if PROH1918>0, 0 otherwise; stays missing where PROH1918 is missing

save "./MMC_1920west.dta" , replace


*****************************************************
*Adding data on redistricting
*****************************************************

*importing data [cells A44:C67 of the `redistricting' sheet, read without a header row into the three named variables]
import excel state_name county_name_r redist2018w using "./MMC_raw.xlsx", sheet("redistricting") cellrange(A44:C67)  clear
save "./redist2018w.dta", replace                             //replace: lets the do-file be re-run without aborting on an existing file

*merging with master
use "./MMC_1920west.dta", clear
gen county_name_r = county_name                               //enabling merger [key variable name used in redist2018w.dta]
merge 1:1 state_name county_name_r using "./redist2018w.dta"  //[24 counties were matched, these refer to redistricted between 1920 and 1918]
drop _merge county_name_r

*cleaning data
replace redist2018w=0 if redist2018w==.                       //counties without a record in redist2018w.dta are coded 0

save "./MMC_1920west.dta" , replace


*****************************************************
*Adding data on suffrage movement strength
*****************************************************

*extracting and cleaning data from MMC_nawsa.dta [created above]
*[no preserve/restore needed: the master is loaded from disk right after the extract is saved]
use "./MMC_nawsa.dta", clear
*keeping states in the sample for `west' data set [i.e. states mostly in the West where women could vote to Congress before the 1920 election]
keep if inlist(state, 13, 32, 61, 62, 63, 64, 65, 67, 68, 71, 72, 73)
tsset  state yr
tsfill, full                                                          //creating balanced time-series
bysort state (yr): replace state_name = state_name[_n-1] if state_name==""   //adding state labels to new year entries; (yr) makes the within-state year order explicit
replace membpc = L.membpc if membpc==.                                //if membpc missing, replacing with the previous year's value
                                                                      //NOTE(review): confirm this fills gaps longer than one year as intended
keep if yr==1919                                                      //keeping 1919 values
keep state state_name membpc                                          //preparing for merger with MMC_1920west.dta
save "./nawsa20w.dta", replace                                        //replace: lets the do-file be re-run without aborting on an existing file

*merging master with nawsa20w.dta
use "./MMC_1920west.dta", clear
merge m:1 state_name using "./nawsa20w.dta"
drop _merge

save "./MMC_1920west.dta" , replace


*****************************************************
*Data extraction for 1920west data set
*****************************************************

*Sample restrictions applied to the data in memory [the version just saved to disk is unrestricted].
*Note: Stata treats missing values as larger than any number, so every `>' / `>=' condition below
*also drops observations where the tested variable is missing.
drop if cd20>=52             //dropping counties that do not map into congressional districts [also drops missing cd20]
drop if redist2018w==1       //dropping counties which changed boundaries between 1918 and 1920
drop if redistcd2018==1      //dropping counties that moved from one districts to another between 1920 and 1918 
drop if pty66>200 & pty66<.  //dropping third party incumbents [codes above 200, excluding missing]
drop if pty66==.             //dropping district with missing data on `incumbent party', which indicates districts where an incumbent did not serve full term [see MMC_build.do for further details]
keep if incrun20==1          //keeping districts where incumbent run AND run for the same party [drops missing incrun20 as well]
drop if v66_18>99            //dropping `uncontested' counties in 1918 [`99' cutoff-point accounts for scattering]; also drops counties with missing v66_18
drop if v66_20>99            //dropping `uncontested' counties in 1920 [`99' cutoff-point accounts for scattering]; also drops counties with missing v66_20


*****************************************************
*Data preparation for 1920west data set
*****************************************************

*generating centered variables for interaction
*`center' is a user-written command (SSC); install it only if it is not already available
capture which center
if _rc {
    ssc install center
}
center awomen20, generate(awomen20x)
center progsc66, generate(progsc66x)

save "./MMC_1920west.dta" , replace  //this is the final version of MMC_1920w used for analyses




*****************************************************
*****************************************************
**          CREATING DW-PROG DATA SET              **
*****************************************************
*****************************************************


*importing & cleaning data on dw-nominate
import excel "./MMC_raw.xlsx", sheet("dw-nominate") firstrow clear  //importing raw data
duplicates tag state cd, gen(dupl)      //dupl = number of OTHER rows with the same state and district  [This tags districts with incumbents who did not serve full term.]
drop if dupl>0                          //dropping every duplicated district, whatever the number of copies [This enables the 1:1 merger with prog.dta created below.]
rename cd dist                          //preparing for merger with prog.dta [created below]
save "./dwnom.dta", replace             //replace: lets the do-file be re-run without aborting on an existing file

*importing & cleaning data on bills and incumbent party
import excel "./MMC_raw.xlsx", sheet("h_bills 66th") firstrow clear  //importing raw data
duplicates tag state dist, gen(dupl)    //tagging duplicates  [This tags districts with incumbents who did not serve full term.]
drop if dupl>0                          //dropping duplicates [The rationale is that these incumbents did not have a chance to vote on all progressive bills - often having a chance to vote only on a handful of progressive bills.]
recode V2-V321 (2 3 4 5 6 7 8 9 = 0)    //identifying `yea' votes casted in person: code 1 is kept, codes 2-9 become 0
gen progsc66 = ((V2+V17+V33+V35+V41+V43+V96+V103+V104+V112+V141+V153+V170+V201+V207+V211+V215+V228+V238+V250+V310+V321)/22)*100  //generating progressive score: % `yea' on the 22 progressive bills
keep state dist progsc66 party
*keeping states in the sample [same set of state codes as before, duplicates removed]
keep if inlist(state, 1, 2, 3, 4, 5, 6, 11, 12, 14, 21, 22, 23, 24, 25, 31, 33, 34, 35, 36, 37, 40, 41, 42, 43, 45, 47, 48, 49, 51, 52, 53, 54, 56, 66)
save "./prog.dta", replace              //replace: lets the do-file be re-run without aborting on an existing file

*merging prog.dta [still in memory] and dwnom.dta
*7 observations not merged: 1) one dw-nominate score is missing [not calculated] (MA, dist no.3) in the original data set.
*2) six districts in the dw-nominate data set indicate single incumbent per Congress [and were therefore not dropped above], even though two consecutive representatives were elected.
merge 1:1 state dist using "./dwnom.dta"   //_merge is kept in the final data set, as before

save "./MMC_dwprog.dta", replace




*****************************************************
*****************************************************
**           CREATING SHIFTS DATA SET              **
*****************************************************
*****************************************************

*importing & cleaning data on bills and incumbent party in 68th Congress
import excel "./MMC_raw.xlsx", sheet("h_bills 68th") firstrow clear  //importing raw data
foreach var of varlist V41 - V166 {    //dropping districts where incumbent was a non-member on at least one of the bills [i.e. excluding incumbents who did not serve full term]
    drop if `var' == 0
}
recode V41-V166 (2 3 4 5 6 7 8 9 = 0)  //identifying `yea' votes casted in person: code 1 is kept, codes 2-9 become 0
rename dist cd22                       //preparing for merger: to distinguish between districts of different congresses
rename party pty68                     //preparing for merger: to distinguish between incumbents of different congresses
foreach var of varlist V41 - V166 {    //preparing for merger: to distinguish between bills from different congresses
    rename `var' `var'_68
}
save "./bills68.dta", replace          //replace: lets the do-file be re-run without aborting on an existing file

*creating shifts data set with existing data sets [created above]
use "./bills65.dta", clear
append  using "./bills66.dta"
append  using "./bills67.dta"
append  using "./bills68.dta"
merge m:1 state using "./nawsa20.dta"  //_merge is kept in the final data set, as before

*generating variables [see list of progressive bills in Appendix Tables 3-5 and `progressive' sheet in MMC_raw.xls]
*each score is the % of `yea' votes on that Congress's progressive bills; it is missing for rows of other Congresses
gen progsc65 = ((V22_65+V57_65+V61_65+V67_65+V69_65+V88_65+V93_65+V137_65+V151_65+V158_65+V160_65+V180_65+V200_65+V201_65+V223_65+V228_65)/16)*100
gen progsc66 = ((V2_66+V17_66+V33_66+V35_66+V41_66+V43_66+V96_66+V103_66+V104_66+V112_66+V141_66+V153_66+V170_66+V201_66+V207_66+V211_66+V215_66+V228_66+V238_66+V250_66+V310_66+V321_66)/22)*100
gen progsc67 = ((V21_67+V32_67+V36_67+V37_67+V53_67+V80_67+V89_67+V132_67+V169_67+V174_67+V194_67+V197_67+V198_67+V200_67+V225_67+V237_67+V247_67+V255_67+V296_67+V305_67+V323_67+V335_67)/22)*100
gen progsc68 = ((V41_68+V42_68+V50_68+V53_68+V54_68+V55_68+V62_68+V64_68+V86_68+V90_68+V94_68+V110_68+V111_68+V164_68+V166_68 )/15 )*100
*dummy for strong suffrage movement: 1 if membpc>=0.65, 0 if below
*Stata treats missing values as larger than any number, so a bare `membpc>=0.65' would code
*rows with missing membpc [e.g. states without a record in nawsa20.dta] as 1; they are left missing instead
gen nawsa_60 = (membpc>=0.65) if !missing(membpc)

save "./MMC_shifts.dta", replace





